{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pprint import pprint\n",
    "import seaborn as sns\n",
    "import sklearn\n",
    "import matplotlib.pylab as plt\n",
    "\n",
    "# To suppress FutureWarnings\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 1: Hypothesis Testing with an A/B test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Suppose we work at a large company that is developing online data science tools. Currently the tool has interface type A but we'd like to know if using interface tool B might be more efficient.\n",
    "To measure this, we'll look at length of active work on a project (aka project length).\n",
    "We'll perform an A/B test where half of the projects will use interface A and half will use interface B."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 2 columns):\n",
      "lengths_A    1000 non-null float64\n",
      "lengths_B    1000 non-null float64\n",
      "dtypes: float64(2)\n",
      "memory usage: 15.7 KB\n"
     ]
    }
   ],
   "source": [
    "# read in project lengths from '../data/project_lengths'\n",
    "# there should be 1000 observations for both interfaces\n",
    "df_project = pd.read_csv('../data/project_lengths.csv')\n",
    "df_project.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1.5819526645395978"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the difference in mean project length between interface A and B\n",
    "# for consistency, subtracting A from B\n",
    "# hint: this number should be negative here (could interpret as faster)\n",
    "mean_A = df_project.lengths_A.mean()\n",
    "mean_B = df_project.lengths_B.mean()\n",
    "observed_mean_diff = mean_B - mean_A\n",
    "observed_mean_diff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we'll perform a permutation test to see how significant this result is\n",
    "# generate 10000 random permutation samples of mean difference\n",
    "# hint: use np.random.permutation\n",
    "rand_mean_diffs = []\n",
    "n_samples = 10000\n",
    "combined_times = np.concatenate([df_project.lengths_A.values, df_project.lengths_B.values])\n",
    "n_A = sum(df_project.lengths_A.notnull()) # number of observations for page A\n",
    "for i in range(n_samples):\n",
    "    rand_perm = np.random.permutation(combined_times)\n",
    "    rand_mean_A = rand_perm[:n_A].mean()\n",
    "    rand_mean_B = rand_perm[n_A:].mean()\n",
    "    rand_mean_diffs.append(rand_mean_B-rand_mean_A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xl0nHd97/H3d0a7RrtGshbLsi3JS5zYiZ09zp6SQBYoBBKgQKFNe9oc4PZyCy0lhRQohVNaLqQ95JLQ0OIsBAIOccjqLCaObTl24kWWrVi2JGvfd41G871/SE6EI1sjeUbPLN/XOT7WSI9nPsfWfPzo9/ye309UFWOMMbHF5XQAY4wxoWflbowxMcjK3RhjYpCVuzHGxCArd2OMiUFW7sYYE4Os3I0xJgZZuRtjTAyycjfGmBiU4NQL5+fna3l5uVMvb4wxUWn37t2dquqd7TjHyr28vJzq6mqnXt4YY6KSiBwP5jgbljHGmBhk5W6MMTHIyt0YY2KQlbsxxsQgK3djjIlBQZW7iNwoIrUiUiciX5nh658RkQ4R2Tv1689CH9UYY0ywZp0KKSJu4D7gBqAJ2CUim1X14CmHPqqqd4chozHGmDkK5sz9IqBOVY+qqg94BLgtvLGMMcacjWDKvQRonPa4aepzp/qwiLwlIo+LyOKQpDPGGDMvwZS7zPC5U3fVfhIoV9XzgOeBh2Z8IpG7RKRaRKo7OjrmltSExdVXX83VV1/tdAxjTIgFU+5NwPQz8VKgefoBqtqlqmNTD/8fsH6mJ1LV+1V1g6pu8HpnXRrBGGPMPAWztswuoFJElgIngDuAj08/QESKVLVl6uGtQE1IUxoTJpt2NAR97McvLgtjEmNCa9ZyV1W/iNwNPAO4gQdV9YCI3AtUq+pm4PMicivgB7qBz4QxszFho6q09o9S2zrAsG+CiYCSluSmLDeNB7fVk5Lo/oPjrfBNpApqVUhV3QJsOeVz90z7+O+AvwttNGMW1r4TfTx3sI3OwTEESHS7cLlgbDyAAi6BtaXZbKz0sigrxem4xpyRY0v+GhMpfP4AT+1rZtexHoqzUrhtXTHnFGfhSZ58e4yOT9DUM0JNaz+7j/Wwp7GXC8qyuWVtscPJjTk9K3cT18b8E/zk1XpO9I5wZaWXG1YX4nb94QSxlEQ3FQUeKgo8XLeygFePdPLK4Q4auke4ZFkeq4oyHUpvzOlZuZu4cerF04Aqj+9uorl3hE9eXMbq4qxZnyMtKYH3nbOIygIPj1Y38sf/8Rr//bmL2FCeG67YxsyLLRxm4tYLNe0caO7n/ecWBVXs0y3zerj7mgqKslL405/uYv+JvjClNGZ+rNxNXDrSNsDW2nY2LMnhsuV583qOjJREPrK+FLdb+OiPt/OD54+waUfDnKZXGhMuVu4m7vgDAZ58q5m89CRuXVuMyEw3YQcnOy2Jz12+FAF+vuM4Pn8gdEGNOQtW7ibuvFbXReegj5vPKybBffZvgTxPMh/dsJiOgTF++1bz7H/AmAVg5W7iSv/IOC/WtrNyUQYrFmWE7HkrCzO4aoWX6uM97GnoCdnzGjNfVu4mrjx3sI1AQPnAuUUhf+7rVhZSnpfO5jebaekbCfnzGzMXVu4mbvQO+9jT2MOFS3PJ8ySH/PndLuEj60sJqPIPT+xH9dTFU41ZOFbuJm5sq+sEYGNFftheIzc9iRtWFfLCoXaefKtl9j9gTJhYuZu40D3kY9exbtYtziY7LSmsr3VZRT5rS7P4xuYD9Az5wvpaxpyOlbuJCw+9dozxCWVjZfj3EXCJ8J0Pn0fPsI/vP3c47K9nzEys3E3MG/FN8ND2Y6wqyqQwc2FWc1xVlMknL1nCz3ccp6alf0Fe05jprNxNzHvyzWZ6h8e5Ioxj7TP5mxuqyExN5BtPHrCLq2bBWbmbmPc/O45TVeihPC9twV5z044Gtuxr5cpKL68f7ebvn9i/YK9tDNiqkCbGvdXUy1tNfXzj1nPOapmB+bqwPJcd9V08d7CV1UWZ71lO2HZyMuFiZ+4mpv389QZSE9186IISR17f7RJuWFVI56CPvY29jmQw8cnK3cSsvpFxNr/ZzG3rislMSXQsx6qiTEqyU3nxUBv+gC0sZhaGlbuJWb/ec4KR8Qk+cfESR3OICDesLqRneJzdx23dGbMwrNxNzHp0VyPnFGdybuncNuIIh8oCD0ty09h6qN3O3s2CsAuqJuZs2tFAc+8IB1v6uWVtcURsniEiXLOygP967Rj7mvo4vyzH6UgmxtmZu4lJ1cd7SHAJ60qznY7yjsoCDwUZyfy+rtPmvZuws3I3MWd8IsCbjb2sLs4kNcntdJx3iAiXL8+nuW+U+q4hp+OYGGflbmLOwZZ+RsYn2LAk1+ko77GuLJu0JDe/r+tyOoqJcVbuJubsPt5Ddloiy7zpTkd5j0S3i4uX5nKopZ+uwTGn45gYZuVuYkpTzzBvtw+yviwHlwN3pAbj4mV5iMCuYzYt0oSPlbuJKb+obgJg/ZLInY2SmZJIVWEGext7mAjYhVUTHlbuJmZMBJTHdzdRUeAJ+4YcZ+uCshz6R/3v7A5lTKhZuZuYsa2ukxO9IxF91n7SykUZpCa6eXx3k9NRTIyycjcx47FdjeSkJbK6KNPpKLNKcLtYuzibZw600jcy7nQcE4Os3E1M6B7y8ezBVj54fgkJ7uj4tl5floPPH+Ap20jbhEFQ7wIRuVFEakWkTkS+cobjPiIiKiIbQhfRmNk9secE4xPKxy5c7HSUoBVnp1BV6OFXb9jQjAm9WctdRNzAfcBNwGrgThFZPcNxGcDngR2hDmnMmagqj+1qZG1pFisXRf6QzEkiwi3nFVN9vIe2/lGn45gYE8yZ+0VAnaoeVVUf8Ahw2wzH/RPwXcC+S82CerOpj9q2AT52YfTtanTTuYsAeOZAq8NJTKwJptxLgMZpj5umPvcOETkfWKyqvw1hNmNmtWlHA996qoZEtzA6PhERK0DORUVBBhUFHrbss3F3E1rBlPtMt/m9c+eFiLiAfwP+96xPJHKXiFSLSHVHR0fwKY05DZ8/wFtNvZxbkkVKYuQsEjYX71+ziJ313bYcgQmpYMq9CZh+laoUaJ72OANYA7wkIseAS4DNM11UVdX7VXWDqm7wer3zT23MlP0n+hjzB1gfgYuEBevGNUUEFJ492OZ0FBNDgin3XUCliCwVkSTgDmDzyS+qap+q5qtquaqWA68Dt6pqdVgSGzNN9fFu8j1JlOelOR1l3lYVZbAkL42n99u4uwmdWctdVf3A3cAzQA3wmKoeEJF7ReTWcAc05nSOdgxyrGuY9UtykQhdJCwYIsKNaxbxWl0nfcN2Q5MJjaDmuavqFlWtUtXlqvqtqc/do6qbZzj2ajtrNwvhseomXAIXlEXObkvz9b5zFuEPKC8fsWtRJjRsD1UTlfwTAX75RhMrCjPISEl0Os68nZzdE1AlLcnNT7fVMzjq5+MXR9+0ThNZouM+bWNOsbW2g46BMTaUR++F1OlcIlQVZlDbNkDA9lc1IWDlbqLSo7sa8GYkU1WY4XSUkKkqzGDYN8GJnhGno5gYYOVuok77wChbazv48AWluF3ReyH1VFUFHgSobRtwOoqJAVbuJuo88cYJJgLKRzeUOh0lpNKSE1icm8ZhK3cTAnZB1USNTTsaUFV+8mo9S3LTeP1ot9ORQq6qMIPna9roGBjDm5HsdBwTxezM3USVxp4ROgbHomK3pflYuWjyGsIrh21KpDk7Vu4mquw+3kOiWzi3JMvpKGFRlJVCRkoCW2vbnY5iopyVu4ka0xcJS47SRcJmI1NTIl853IF/IuB0HBPFrNxN1DjQPLlI2AUxOiRz0orCDPpH/exp7HU6ioliVu4mauxu6CE3PYmleelORwmrigIPCS5h6yEbmjHzZ+VuokJj9zBHO4a4oCwnqhcJC0ZKopsN5TlsrbWLqmb+rNxNVHh8dxNCbCwSFoxrVhRQ09JPa5/tWmnmx8rdRLxAQHl8dxPLCzxkpyU5HWdBXLOyAICXbNaMmScrdxPxth/t4kTvSMzObZ9JZYGHkuxUmxJp5s3K3US8X1Q3kpmSwOqiTKejLBgR4eoVXrYd6cTntymRZu6s3E1EGxrz87sDrdyytphEd/x8u27a0YAgDPkm+O7vDr2z7rsxwYqfd4uJSs/XtDE6HuCD55c4HWXBLfem4xI43DbodBQThazcTUT7zd5mirNSWF8WP+PtJyUnulmSl86Rdlsl0sydlbuJWD1DPl453MEt64pxxdC67XNRVZhBS98o/aO2cbaZGyt3E7G27G/BH1BuWxt/QzInVRZ4AKizoRkzR1buJmL9Zm8zFQUeVhXFzlZ6c1WUlUJGcgKHbWjGzJGVu4lILX0j7Kzv5ra1xTG/3MCZiAiVhR7q2geZCNjG2SZ4Vu4mIm3Z1wrAzWuLHU7ivMqCyY2z953oczqKiSK2zZ6JOJt2NPCz7cdYlJnC9re72P52l9ORHFUxtXH2y7UdrFscH2vrmLNnZ+4m4vSPjtPQNcyakvi5I/VM0pMTKMlJ5eXDthSBCZ6Vu4k4B5v7UeCc4tjcSm8+qgoz2NvYS9+wTYk0wbFyNxFn/4k+vBnJFGamOB0lYlQWeAgobKvrdDqKiRJW7iaidA2OUd85xJpiG5KZrjQnjcyUBBuaMUGzcjcR5dmDbSiwpsSGZKZzu4SNlV5eOdyJqk2JNLOzcjcR5ZkDreSmJ7HIhmTe48qqfFr7R20hMRMUmwppIsbgmJ/X6rq4aGluXN+4dDrdQ5MXU//9+cNsrPQC8PGLy5yMZCKYnbmbiPHq4Q58EwFWxvFyA2eSlZpIYWYyh9tsKQIzu6DKXURuFJFaEakTka/M8PW/FJF9IrJXRLaJyOrQRzWx7rmDbWSnJbIkN93pKBGrqiCDY13DtjuTmdWs5S4ibuA+4CZgNXDnDOW9SVXPVdV1wHeB74c8qYlp/okAL9a2c+2KAtxxurxvMCoLM5gIKEc7bdzdnFkwZ+4XAXWqelRVfcAjwG3TD1DV/mkP0wG7nG/mZPfxHnqHx7lhdaHTUSLakrw0Et1iF1XNrIK5oFoCNE573ARcfOpBIvLXwN8AScC1IUlnYt7JvUG37GvB7RJa+0ZJTnQ7nCpyJbpdLMv3cMTG3c0sgjlzn+ln5Pecmavqfaq6HPgy8A8zPpHIXSJSLSLVHR0dc0tqYpaqUtPSz3JvuhV7ECoLPXQN+egaHHM6iolgwZR7E7B42uNSoPkMxz8CfHCmL6jq/aq6QVU3eL3e4FOamNYxOEbXkI+Vi+yu1GBUFU7OJjrSbkMz5vSCKfddQKWILBWRJOAOYPP0A0SkctrDDwBHQhfRxLra1skhhpWLbApkMPLSk8hNT7IpkeaMZh1zV1W/iNwNPAO4gQdV9YCI3AtUq+pm4G4RuR4YB3qAT4cztIktta0DFGYmk52W5HSUqCAiVBZ42NPQi88fICnBblcx7xXUHaqqugXYcsrn7pn28RdCnMvEidHxCY51DXFFhQ3TzUVVYQY76rupPt7NZcvznY5jIpD9l28cVdc+SEBhhQ3JzMmy/HTcIrx82CYmmJlZuRtH1bYOkJropiw3zekoUSU50c2SvDRerrVyNzOzcjeOCQSU2rYBKgs9dlfqPFQWZnCodYC2/lGno5gIZOVuHLO/uY/BMT8rCm1IZj6qCj0AvGJDM2YGVu7GMS8eakeYPAM1c7coMwVvRrKNu5sZWbkbx2w91E5pTiqeZNtWYD5EhKuqvGyr62QiYMs5mT9k5W4c0TEwxptNfaywu1LPypVVXnqHx3mrqdfpKCbCWLkbR7xUO7nRs02BPDsbK/IRwYZmzHtYuRtHbK1tpyAjmeIs2yv1bOSkJ7G2NNvK3byHlbtZcOMTAV493Mk1Kwpsr9QQuLLKy5uNvfQO+5yOYiKIlbtZcNXHehgY83PNygKno8SEq6q8BBS21XU6HcVEECt3s+C21raT6BauqLQ1UUJhbWkWWamJdreq+QNW7mbBvXionYuW5toUyBBJcLu4ojKfV450oGpTIs0kK3ezoI53DVHXPsh1K22v1FC6qtJLW/8YtbbGu5li5W4W1As1k1Mgr1tl4+2hdGXV5JLJNjRjTrKfi82CevFQO8u96SzJS3c6Skw4ucE4TC5H8Gh1IxkpiXz84jIHU5lIYOVuFsyD2+rZ/nYXl1Xk/UEpmdCoLPDw2ttd+PwBp6OYCGDDMmbBHGkfZELVNsIOk8rCDCZUqe+0jbONlbtZQLWt/bYxRxgtyUsjwSUcabdyN1buZoFMBJRDrQNU2cYcYZPodrE0P93K3QBW7maB7G3sZdg3wcoiG5IJp8oCDx0DYzT3jjgdxTjMyt0siBdq2nAJVBXYKpDhVDG18cm2I7YUQbyzcjcL4sVD7SzJSyc1ye10lJhWmJFMRkoCrxyx+e7xzsrdhF1TzzCHWgdYaWu3h52IUFngsd2ZjJW7Cb8XD03elWpTIBdGRUEGvcPjHGjuczqKcZCVuwm7F2raWZqfjjcj2ekocaGiwAPAK7aBR1yzcjdhNTTmZ/vbXVxra7cvGE9yAmtKMnnFLqrGNSt3E1bb6jrxTQRsobAFtrHSyxvHexgc8zsdxTjE1pYxYfX8wTYyUhK4sDyXY53DTseJGz5/AH9A+ZenD7Fq6t4CW0wsvtiZuwkb/0SA52vauHZlAYlu+1ZbSEty00h021IE8czecSZsdh3roWd4nBvPWeR0lLiT4HaxLN9DXbtt3hGvrNxN2DxzoJXkBBdXrfA6HSUuVRR46Bz00TPkczqKcUBQY+4iciPwA8AN/ERVv3PK1/8G+DPAD3QAn1XV4yHOaqLEph0NqCpP7DnBcq+HX+9pdjpSXKqcmhJZ1z7IhUtzHU5jFtqsZ+4i4gbuA24CVgN3isjqUw7bA2xQ1fOAx4HvhjqoiS4nekfoGxnnnGK7cckp3oxkslITOWJDM3EpmGGZi4A6VT2qqj7gEeC26Qeo6lZVPTkV4nWgNLQxTbQ50NyPS+yuVCeJCBUFHuo6BgmoLUUQb4Ip9xKgcdrjpqnPnc7ngKfPJpSJbqrKgeY+lnk9tlCYwyoLPIyOB2jqsSWA400wY+4z7aww42mAiHwS2ABcdZqv3wXcBVBWZnNuY1Vr/yidgz4ur8h3Okrcq/B6ELChmTgUzJl7E7B42uNS4D1XyETkeuCrwK2qOjbTE6nq/aq6QVU3eL02gyJW7WvqwyVwTnGW01HiXlpyAiU5qdS12Xz3eBNMue8CKkVkqYgkAXcAm6cfICLnAz9mstjbQx/TRAtVZd+JySEZT7LdAB0JKgo8NPYM0z867nQUs4BmLXdV9QN3A88ANcBjqnpARO4VkVunDvse4AF+ISJ7RWTzaZ7OxLgDzf10Dfk4t8TO2iNFZUEGAYXtb3c5HcUsoKBOrVR1C7DllM/dM+3j60Ocy0SpJ99qnhySsb1SI8bi3FSSEly8eqSD99ndwnHD7lA1IaOqPPVWCxUFHtJsSCZiJLhcLMtP51VbAjiuWLmbkNnb2EtTzwjnlmQ7HcWcorLAw/GuYY51DjkdxSwQK3cTMr/Z20yS28VqG5KJOCumbiZ74ZDNd4gXVu4mJMYnAjz5ZjPXrSqwG5ciUG56ElWFHp4/2OZ0FLNArNxNSLxyuIOuIR9/fIGtPBGprltVyK5j3fSN2JTIeGDlbkLiV3tOkJOWyFVVdnNapLp+VQH+gPKybZwdF6zczVnrGxnnuYNt3Lq2mKQE+5aKVOsW55CXnmRDM3HC3onmrD29rwWfP2BDMhHO7RKuWVnAS7XtjE8EnI5jwszK3Zy1X71xgmXedM4rtbtSI931qwroH/VTfazH6SgmzKzczVn50Yt17DzWTYXXw8M7G9m0o8HpSOYMNlZ6SUpw8ezBVqejmDCzcjdnZW/j5Bng2sV241I0SE9O4MpKL0/vayUQsA08YpmVu5k3VWVPQy9L89PJSUtyOo4J0i1ri2jtH+WNBhuaiWVW7mbe9jT20jXk44IyO2uPBpt2NLBpRwPdgz4SXMK/PnfY6UgmjKzczbw98cYJElxim3JEmeREN1WFGew/0ceEDc3ELCt3My8+f4An32pmdXEmKYm23EC0Obc0i4FRP9XHup2OYsLEyt3My/M1bfQOj3P+4hyno5h5WLkogwSX8NS+FqejmDCxcjfz8siuRoqzUqgs9DgdxcxDcoKbFYsy2LKvxW5oilFW7mbOGruHefVIB7dvWIxLxOk4Zp7OX5xD56CPl2ttrZlYZOVu5uwX1Y0A3L7BlhuIZisWZZDvSebRqX9PE1us3M2cTASUx6qbuLLSS2lOmtNxzFlwu4QPX1DCi4faaR8YdTqOCTErdzMnLx9up7V/lDsvWux0FBMCt29YzERA+fWeE05HMSFmuxiboJxcM+Zn24+RnpxA+8CYrSMTAyoKPKxfksOjuxr5843LELuGEjPszN0ErXvIR23rABeV55Dgsm+dWPHRDaW83THE7uO2HEEssXeoCdrrR7sQgYuW5jkdxYTQzecVk5GSwH+9dszpKCaErNxNUHz+ANXHuzmnOIus1ESn45gQSk9O4M6Lynh6fysnekecjmNCxMrdBGVvYy+j4wEuXWZn7bHo05eVA/AzO3uPGVbuZlaqyvajnRRlpbAkz6Y/xqKS7FRuXLOITTsbGBrzOx3HhICVu5nVS4c7aOsf47Ll+TabIoZ97oqlDIz637lJzUQ3K3czq//YWkd2aiLrbLelmHZBWQ4XlGXzk231tt5MDLByN2e0s76bXcd62FiZj9tlZ+2x7u5rK2jqGeGJN+ympmhnNzGZM7pvax35niQ2lOc6HcWEwak3oqkqJdmp/GhrHR+6oIREt53/RSv7lzOnta+pj5cPd/DZK5bamzxOiAjXriygoXvYliSIcvaONaf13WcOkZ2WyCcvWeJ0FLOAVi7KYHVRJj/aWoffxt6jVlDlLiI3ikitiNSJyFdm+PqVIvKGiPhF5COhj2kW2rYjnbx6pJO7r6kgM8VuWoonIsIXr6/keNcwv9jd5HQcM0+zjrmLiBu4D7gBaAJ2ichmVT047bAG4DPAl8IR0iysQED556drKMlO5U8utbP2eNQxMEZZbhrffqqGsfEASQkuPn5xmdOxzBwEc+Z+EVCnqkdV1Qc8Atw2/QBVPaaqbwH2M1wMePKtZg409/Ol91WRnGCbX8cjEeGmNYsYGPOzra7T6ThmHoIp9xJg+l0NTVOfmzMRuUtEqkWkuqPDtvaKRCO+Cb77u1pWFWVy29p5/TObGLEkL53VRZm8cqSDQbtrNeoEU+4zTW7W+byYqt6vqhtUdYPX653PU5gQa+8fo71/cm32TTsa+Kufv8GJ3hG+fstqXDavPe790TmF+CcCvHio3ekoZo6CKfcmYPq2O6VAc3jiGCd1DY7x6pEObltXzMW2QJgBCjJS2FCey876Luo7h5yOY+YgmHLfBVSKyFIRSQLuADaHN5ZxwlP7WnC5hL9//yqno5gIct3KAhJcLr73zCGno5g5mHW2jKr6ReRu4BnADTyoqgdE5F6gWlU3i8iFwBNADnCLiHxDVc8Ja3ITUoda+jnUOsBNaxbxQo39CG7elZGSyBWV+WzZ18qehh7OL8txOpIJQlDz3FV1i6pWqepyVf3W1OfuUdXNUx/vUtVSVU1X1Twr9uiiLje/3deC15PMpcttOMa818aKfPI9SXx7Sw2q87rkZhaY3aFq8JdfSveQj1vWFtveqGZGyYlu/tcNVew61sNT+1qcjmOCYO/kOBdIyWJ86eWsKcmiosDjdBwTwe64sIxVRZl8+6kaRnwTTscxs7Byj3PjK24AlPevWeR0FBPh3C7hG7eeQ3PfKP/58ttOxzGzsCV/49jLhzuYKFxJ4pEXyU5b73QcE+FOLg98XmkW/7G1jiS3i9z0JFuWIELZmXucGvNP8PXNB5ChLhKO7XA6jokiN60pwuUSnnyz2S6uRjAr9zj1wLZ66juHSDr0DKI2fmqCl5WayA2rCqltG2B/c7/TccxpWLnHoebeEX74Qh03rC7E3XXU6TgmCl2yLI/i7BR++2Yz/aPjTscxM7Byj0PfeqqGgCr33Lza6SgmSrldwofWlTI45uc7T9udq5HIyj3ObDvSyVP7WvjraypYnJvmdBwTxUpyUrm8Ip9NOxp4+bCt8hpprNzjiM8f4J7N+1mSl8ZdVy5zOo6JATesLqSywMPfPv4mvcM+p+OYaazc48iDv6/naMcQX7/lHFISbRMOc/YS3S7+7WPr6Br08bXfHHA6jpnGyj1O/OdLb/P9Zw+zqiiTlr7Rd9ZvN+ZsrSnJ4gvXVfLkm808Vt04+x8wC8LKPU5s2ddCQJWbzy1yOoqJQX91TQWXLc/ja7/ez0GbHhkRrNzjwLYjnew70cfVK7zkpCc5HcfEILdL+L93nk92WiJ/9fPdNj0yAli5x7gR3wRf+81+ctOT2FhpWxua0Ds5xPfsgTY+uK6Ehu5hvvDwHiYCdveqk6zcY9x3nq6hvnOIPz6/hES3/XOb8FqSl84ta4vZWtvBN5866HScuGYLh8WwbUc6eWj7cT57+VKWeW05X7MwLl6aR156Mg/+vp6l+el86tJypyPFJTuVi1G9wz7+z+Nvstybzt/euMLpOCbOfPUDq7huZQFf33yAp21zD0dYuccg/0SAuzftoWvQx799bJ3NaTcL7tFdjWys9FKak8bdD+/h3idtiGahWbnHoH9++hDb6jr55ofWcF5pttNxTJxKSnDx6UvL8XqS+Z/Xj1N9rNvpSHHFyj3GPLKzgQe21fOZy8r56IbFTscxcS41yc2fXl5OZmoCn3pwJzuOdjkdKW5YuceQx3Y18ndP7KOywMNyr+edKWp2J6pxUkZKIn+2cRnF2al85qe7+H1dp9OR4oKVe4x4ZGcDX/7VW2ys9PLJS5bgdonTkYx5R2ZKIg//+SWU5abxmZ/u5Nd7TjgdKeZZuUc5/0SAb/72IF/51T42Vnq5/0/W23x2E5G8Gck89heXckFZDl98dC8/fOGIbdMXRjbPPYq19I3wxUf2sqO+m09fuoSvfmA1SQlW7CYynRz31Q39AAAIeUlEQVQe/MC5RYz5A/zrc4d5al8LD//5JbYsRhhYuUch/0SAh7Yf5/vP1uKbCHD7+lJWLMrk8d1NTkczZlYJbhe3ry+lNCeVp/e3ctMPXuV7t59ny2OEmJV7FAkElN8daOXfnz/M4bZBrqrycmF5Lrl21mOijIhw2fJ8luSl8/S+Fv7kgZ3cfF4RX7t5NYWZKU7HiwlW7lFg2OfnN3ub+env6zncNshybzr/+YkLuHHNIh7eaetnm+hVkp3Kli9s5McvH+W+l+p4oaadT122hL+4crmdtJwlK/cItGlHA6pKU88Iexp72dvYw+h4gEWZKXxsw2LOLc2iZ3jcit3EhF+9cQJvRjKfv7aS52vauP/lo/zP9uN8ZH0pn7qsnOW2LtK8WLlHmKaeYbbWtrOnoYfOQR8JLmF1cSaXLM1jSV4aIjbF0cSm3PQkPrphMVdXeWnoHubhnY08tP04Gyvz+cxl5Vy9osCm+M6BlXsE6BsZ53f7W/jlGyfYWT95i3Z5XjpXVnpZU5Jla8OYuFKQmUJBZgorFmWw61gPO+u7+NxD1ZTmpPKh80u4bV0xFQUZTseMeFbuDmnqGeb5g208V9PGjqPd+APKsvx0vvRHVQhiU8NM3MtISeTalQVcVeXlYEs/TT3D3Le1jh++WMdybzrXrSrkykovaxdnkZGS6HTciBNUuYvIjcAPADfwE1X9zilfTwZ+BqwHuoCPqeqx0EaNXqpKc98oexp62NPQy2tvd1HTMrnPpDcjmcuW53NOcSalOak27GLMKdwu4dySLM4tyeLyinz2n+jjUMsAD7xaz/2vHEUEKgs8nL84h/PLsllVlEl5fjpZqfFd+LOWu4i4gfuAG4AmYJeIbFbV6Wt4fg7oUdUKEbkD+BfgY+EIHMlUlY6BMeo7hzjeNUx91xB17YPsbeylY2AMgOQEF+sWZ/PV96/i+tWFbH/bFlIyJliZKYlctjyfy5bnMzo+QUP3MI3dwzT2DLP5zWYerX53kkFeehJL89NZmp9OeX46y/LTKctLozQ7jczUhJg/kQrmzP0ioE5VjwKIyCPAbcD0cr8N+PrUx48DPxIR0QW+t1hVCSgEVAmoolMf+wPK6PgEY+MBRscnGB0PMDI+MfXxBKP+yc+PTwQY9wfYUd/NRED/4FdFoYeJwORzTUxM/j405qd3xEfv8Dh9I+N0D/kY8wfeyeMSyE1PpjQnlUuW5VGWk8airJR3LgpZsRszfymJbqoKM6gqnBx/V1W6hny094/SOeijc3CMriEftW0DDIz6/+DPpie5Kc5OpTg7lXxPMtlpiWSlvvsrOcFFgttFgltIdLlIdAsJ7qnfpz1OcAlJCZO/n/p1p//zCKbcS4Dpc+6agItPd4yq+kWkD8gDQr782wPb6vnXZ2uZCLxb3oGpUg8Hl0z+WLi3sZcEt+B2Tf0SwR9Q0pLcpCYlUJyVynKvh5z0JPLSk8j3JJOVmmhX941ZICJCvieZfE/ye742Nj5B15CP7iEfvSPj9A1P/l7XPkhd+yC9wz6GfBMhz+SSyVwnfxfAJcI/3rKaOy4qC/nrTRdMuc/UTqdWaTDHICJ3AXdNPRwUkdogXj/c8gnDf0ILIKS5P3HJklA91ZnY3/XCicbMEJ2555z5zm/CnfN/vaDerMGUexMwfdeHUqD5NMc0iUgCkAW8Z9sVVb0fuD+YYAtFRKpVdYPTOeYqGnNHY2aIztzRmBmiM3ekZg5mCcFdQKWILBWRJOAOYPMpx2wGPj318UeAFxd6vN0YY8y7Zj1znxpDvxt4hsmpkA+q6gERuReoVtXNwAPAf4tIHZNn7HeEM7QxxpgzC2qeu6puAbac8rl7pn08Ctwe2mgLJqKGieYgGnNHY2aIztzRmBmiM3dEZhYbPTHGmNhj2/YYY0wMsnIHROSfROQtEdkrIs+KSLHTmYIhIt8TkUNT2Z8QkWynM81GRG4XkQMiEhCRiJthMJ2I3CgitSJSJyJfcTpPMETkQRFpF5H9TmcJlogsFpGtIlIz9b3xBaczBUNEUkRkp4i8OZX7G05nms6GZQARyVTV/qmPPw+sVtW/dDjWrETkj5icmeQXkX8BUNUvOxzrjERkFRAAfgx8SVWrHY40o6llNw4zbdkN4M5Tlt2IOCJyJTAI/ExV1zidJxgiUgQUqeobIpIB7AY+GAV/1wKkq+qgiCQC24AvqOrrDkcD7MwdgJPFPiWdGW7AikSq+qyqnryv+nUm70GIaKpao6qRcPPabN5ZdkNVfcDJZTcimqq+wgz3mEQyVW1R1TemPh4Aapi86z2i6aTBqYeJU78ipjus3KeIyLdEpBH4BHDPbMdHoM8CTzsdIobMtOxGxBdOtBORcuB8YIezSYIjIm4R2Qu0A8+pasTkjptyF5HnRWT/DL9uA1DVr6rqYuDnwN3Opn3XbLmnjvkq4Gcyu+OCyRwFglpSw4SOiHiAXwJfPOWn6YilqhOquo7Jn5ovEpGIGQqLm806VPX6IA/dBDwF/GMY4wRtttwi8mngZuC6SLkreA5/15EsmGU3TIhMjVn/Evi5qv7K6Txzpaq9IvIScCMQERez4+bM/UxEpHLaw1uBQ05lmYupTVS+DNyqqsNO54kxwSy7YUJg6sLkA0CNqn7f6TzBEhHvyRlqIpIKXE8EdYfNlgFE5JfACiZncRwH/lJVTzibanZTyz0kM7n7FcDrkT7LR0Q+BPwQ8AK9wF5VfZ+zqWYmIu8H/p13l934lsORZiUiDwNXM7lSYRvwj6r6gKOhZiEiVwCvAvuYfA8C/P3UnfERS0TOAx5i8vvDBTymqvc6m+pdVu7GGBODbFjGGGNikJW7McbEICt3Y4yJQVbuxhgTg6zcjTEmBlm5G2NMDLJyN8aYGGTlbowxMej/A+Q4fLG0CJjsAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# use seaborn to plot the distribution of mean differences\n",
    "# use plt.vlines to plot a line at our observed difference in means\n",
    "_ = sns.distplot(rand_mean_diffs)\n",
    "_ = plt.vlines(observed_mean_diff,0,0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0236"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the plot should seem to indicate significance, but let's calculate a one-tailed p_value using rand_mean_diffs\n",
    "p_value = sum(rand_mean_diffs < observed_mean_diff) / len(rand_mean_diffs)\n",
    "p_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0903782039609031"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can calculate the effect size of our observation\n",
    "# this is the absolute value of the observed_mean_diff divided by the standard deviation of the combined_times\n",
    "observed_effect_size = np.abs(observed_mean_diff) / np.std(combined_times)\n",
    "observed_effect_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we'll use this for the next 2 steps\n",
    "from statsmodels.stats.power import tt_ind_solve_power"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5239497439167554"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# what is the power of our current experiment?\n",
    "# e.g. how likely is it that correctly decided that B is better than A \n",
    "#   given the observed effect size, number of observations and alpha level we used above\n",
    "# since these are independent samples we can use tt_ind_solve_power\n",
    "# hint: the power we get should not be good\n",
    "power = tt_ind_solve_power(effect_size = observed_effect_size,  # what we just calculated\n",
    "                           nobs1 = n_A,         # the number of observations in A\n",
    "                           alpha = 0.05,        # our alpha level\n",
    "                           power = None,        # what we're interested in\n",
    "                           ratio = 1            # the ratio of number of observations of A and B\n",
    "                          )\n",
    "power"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2573.7171120434386"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# how many observations for each of A and B would we need to get a power of .9\n",
    "#   for our observed effect size and alpha level\n",
    "# eg. having a 90% change of correctly deciding B is better than A\n",
    "n_obs_A = tt_ind_solve_power(effect_size = observed_effect_size,  # what we just calculated\n",
    "                           nobs1 = None,         \n",
    "                           alpha = 0.05,         \n",
    "                           power = .9,         \n",
    "                           ratio = 1             \n",
    "                          )\n",
    "n_obs_A"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Part 2: Data Cleaning and Regression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Preparation and Exploration"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This data is provided by World Bank Open Data https://data.worldbank.org/, processed as in Homework 1.\n",
    "\n",
    "We will be performing regression with respect to GDP and classification with respect to Income Group.\n",
    "To do that we will first need to do a little more data prep."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 217 entries, 0 to 216\n",
      "Data columns (total 15 columns):\n",
      "country_code                           217 non-null object\n",
      "short_name                             217 non-null object\n",
      "region                                 217 non-null object\n",
      "income_group                           217 non-null object\n",
      "access_to_electricity                  217 non-null float64\n",
      "gdp                                    193 non-null float64\n",
      "population_density                     215 non-null float64\n",
      "population_total                       216 non-null float64\n",
      "unemployment                           113 non-null float64\n",
      "region_europe                          217 non-null int64\n",
      "region_latin_america_and_caribbean     217 non-null int64\n",
      "region_middle_east_and_north_africa    217 non-null int64\n",
      "region_north_america                   217 non-null int64\n",
      "region_south_asia                      217 non-null int64\n",
      "region_subsaharan_africa               217 non-null int64\n",
      "dtypes: float64(5), int64(6), object(4)\n",
      "memory usage: 25.5+ KB\n"
     ]
    }
   ],
   "source": [
    "# read in the data\n",
    "df_country = pd.read_csv('../data/country_electricity_by_region.csv')\n",
    "\n",
    "# rename columns for ease of reference\n",
    "columns = ['country_code','short_name','region','income_group','access_to_electricity','gdp','population_density',\n",
    "           'population_total','unemployment','region_europe','region_latin_america_and_caribbean',\n",
    "           'region_middle_east_and_north_africa','region_north_america','region_south_asia',\n",
    "           'region_subsaharan_africa']\n",
    "\n",
    "df_country.columns = columns\n",
    "df_country.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a dummy variable 'gdp_missing' to indicate where 'gdp' is null\n",
    "df_country['gdp_missing'] = df_country.gdp.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "income_group\n",
       "High income            18\n",
       "Low income              3\n",
       "Lower middle income     1\n",
       "Upper middle income     2\n",
       "Name: gdp, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use groupby to find the number of missing gpd by income_level\n",
    "# write a lambda function to apply to the grouped data, counting the number of nulls per group\n",
    "df_country.groupby('income_group').gdp.apply(lambda x: sum(x.isnull()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill in missing gdp values according to income_group mean\n",
    "# to do this, group by income_group \n",
    "# then apply a lambda function to the gdp column that uses the fillna function, filling with the mean\n",
    "# inplace is not available here, so assign back into the gdp column\n",
    "df_country.gdp = df_country.groupby('income_group').gdp.apply(lambda x: x.fillna(x.mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# assert that there are no longer any missing values in gdp\n",
    "assert sum(df_country.gdp.isnull()) == 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create 'populiation_density_missing' dummy variable\n",
    "df_country['population_density_missing'] = df_country.population_density.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill in missing population_density with median, grouping by region\n",
    "df_country.population_density = df_country.groupby('region').population_density.apply(lambda x: x.fillna(x.median()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a normalized 'gdp_zscore' column\n",
    "from scipy.stats import zscore\n",
    "df_country['gdp_zscore'] = zscore(df_country.gdp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAELCAYAAADa9kBCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3XuU3WV97/H3d2ZPkkkIREJMMRCjBi8U6y1FrW0PVmgRPeJq1UqrYmubdbxE9NjVUz1oj5bT6rHVIrYi9QYeoLq8tOhBKghK6UI0pCCQIIwSIBchBHKf2575nj/2b0/2TPbM7CQzs+eXvF9rzcrez+/5PfubyebhM888+/eLzESSJEk62nW0uwBJkiRpNjAYS5IkSRiMJUmSJMBgLEmSJAEGY0mSJAkwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAKi064VPOOGEXLFiRbteXpIOy+233/5YZi5pdx0zxTlbUpm1Ome3LRivWLGCtWvXtuvlJemwRMSD7a5hJjlnSyqzVudst1JIkiRJGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEtPHOd1PlqtseOqDtD168vA2VSJIkqcxcMZYkSZIwGEuSJEmAwViSJEkCDMaSJEkSYDCWJEmSAIOxJEmSBBiMJUmSJKCFYBwR8yLiRxFxZ0TcExEfbtJnbkR8JSJ6IuK2iFgxHcVKkiRJ06WVFeN+4Lcy83nA84GzI+IlY/q8DXgiM1cCnwQ+NrVlSpIkSdNr0jvfZWYCe4qnXcVXjul2LvC/isdfAz4dEVGcK0k6Cl1yySX84Ac/AGDZsmUj7StXrmTNmjXtKkuSxtXSLaEjohO4HVgJ/ENm3jamyzLgYYDMrEbETmAx8NgU1ipJKpGenh62PbYdOiv8or/2v5vOfY+3uSpJGl9LH77LzKHMfD5wEnB6RJw2pks0O21sQ0Ssjoi1EbF227ZtB1+tJKlcOisMzV9M77PPoffZ5zA0//h2VyRJ4zqoq1Jk5g7g+8DZYw5tAk4GiIgKcBxwwLJAZl6Wmasyc9WSJUsOqWBJkiRpOrRyVYolEbGoeNwNnAncO6bbNcD5xePXATe6v1iSJEll0soe4xOBy4t9xh3AVzPz2xHxEWBtZl4DfB74ckT0UFspfuO0VSxJkiRNg1auSvET4AVN2j/U8LgPeP3UliZJkiTNHO98J0mSJGEwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAloIxhFxckTcFBEbIuKeiLigSZ8zImJnRNxRfH1oesqVJEmSpkelhT5V4H2ZuS4iFgK3R8T1mbl+TL9/z8xXT32JkiRJ0vSbdMU4M7dm5rri8W5gA7BsuguTJEmSZtJB7TGOiBXAC4Dbmhx+aUTcGRHfiYhfnoLaJEmSpBnTylYKACLiGODrwHsyc9eYw+uAp2bmnog4B/gX4JQmY6wGVgMsX778kIuWJEmSplpLK8YR0UUtFF+Zmd8Yezwzd2XmnuLxtUBXRJzQpN9lmbkqM1ctWbLkMEuXJEmSpk4rV6UI4PPAhsz8xDh9fqnoR0ScXoy7fSoLlSRJkqZTK1spXga8GbgrIu4o2j4ALAfIzEuB1wFvj4gq0Au8MTNzGuqVJEmSpsWkwTgzbwFikj6fBj49VUVJkiRJM80730mSJEkYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkqbJ5s2bYXiopb6XXHIJl1xyyTRXJEkTa/nOd5IkHYze3l5o8cqdPT0901yNJE3OFWNJkiQJg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZIAg7EkSZIEGIwlSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMJYkSZKAFoJxRJwcETdFxIaIuCciLmjSJyLiUxHRExE/iYgXTk+5kiRJ0vSotNCnCrwvM9dFxELg9oi4PjPXN/R5JXBK8fVi4DPFn5IkSVIpTLpinJlbM3Nd8Xg3sAFYNqbbucAVWfNDYFFEnDjl1UqSJEnT5KD2GEfECuAFwG1jDi0DHm54vokDw7MkSZI0a7UcjCPiGODrwHsyc9fYw01OySZjrI6ItRGxdtu2bQdXqSRJkjSNWgrGEdFFLRRfmZnfaNJlE3Byw/OTgC1jO2XmZZm5KjNXLVmy5FDqlSRJkqZFK1elCODzwIbM/MQ43a4B3lJcneIlwM7M3DqFdUqSJEnTqpWrUrwMeDNwV0TcUbR9AFgOkJmXAtcC5wA9wD7gj6a+VEmSJGn6TBqMM/MWmu8hbuyTwDunqihJkiRppnnnO0mSJAmDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJ0ixz3nnnccYZZ3DmmWfy9re/nZ6eHt797nfT09PDO97xDlavXj2qffv27SPnbt++faRv47Hx2huPbd++fcJ+rWocb7aZzbU1Kkudmlkz8b4wGEuSZpWtW7cCUK1W2bBhAxdddBF33XUXF110EevXr+e+++4b1X7FFVeMnHv55ZeP9G08Nl5747Errrhiwn6tahxvtpnNtTUqS52aWTPxvjAYS5JmjfPOO++Ato0bN5KZbNy4sWn7ddddN7Lae9111430rR/r6elp2j72nO985zvj9mtV43gHe+50m821NSpLnZpZM/W+MBhLkmZMR98uenp6uOCCC0Z99fT0sHnz5pHV4oMxNDQ0sto7PDx8wLGLLrqoafvYcwYHBxkcHGzar1WN4x3sudNtNtfWqCx1ambN1PvCYCxJaioiVkfE2ohYu23btnaXM65qtcr111/PDTfcQLVaPeDYxo0bm7aPPSczycym/VrVON7BnjvdZnNtjcpSp2bWTL0vDMaSpKYy87LMXJWZq5YsWTIlYw7PO5aVK1dy8cUXj/pauXIly5YtO6QxK5UKZ511FmeeeSaVSuWAYytWrGjaPvaciCAimvZrVeN4B3vudJvNtTUqS52aWTP1vjAYS5JmjRNPPPGgz+ns7OQtb3kL559/Ph0dHQccu/DCC5u2jz2nq6uLrq6upv1a1TjewZ473WZzbY3KUqdm1ky9LwzGkqRZ4+qrrz6gbcWKFUQEK1asaNp+9tlns3jxYhYvXszZZ5890rd+bOXKlU3bx57zyle+ctx+rWoc72DPnW6zubZGZalTM2um3hcGY0nSrFJfNa5UKjznOc/hwgsv5LnPfS4XXnghp556Ks985jNHtTeuHJ1//vkjfRuPjdfeeKy+gjxev1Y1jjfbzObaGpWlTs2smXhfxNgPGsyUVatW5dq1aw97nKtue+iAtj948fLDHleSJhIRt2fmqnbXMVMOZc5+1atexZ69+xhauJTeZ58DQPe91/Kipy/l4osvHtX3ggsuADigXZKmQqtztivGkiRJEgZjSZIkCWghGEfEFyLi0Yi4e5zjZ0TEzoi4o/j60NSXKUmSJE2vyuRd+BLwaWCiW4z8e2a+ekoqkiRJktpg0hXjzLwZeHwGapEkSZLaZqr2GL80Iu6MiO9ExC9P0ZiSJEnSjGllK8Vk1gFPzcw9EXEO8C/AKc06RsRqYDXA8uVeUk2SJEmzx2GvGGfmrszcUzy+FuiKiBPG6XtZZq7KzFVLliw53JeWJEmSpsxhB+OI+KWIiOLx6cWY2w93XEmSJGkmTbqVIiKuBs4AToiITcBfAl0AmXkp8Drg7RFRBXqBN2a7bqcnSZIkHaJJg3FmnjfJ8U9Tu5ybJEmSVFre+U6SJEnCYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiTAYCxJkiQBBmNJkiQJMBhLkiRJgMFYkiRJAgzGkiRJEmAwliRJkgCDsSRJkgQYjCVJkiSg5MH4J5t28Pc33Me+gWq7S5EkSVLJlToY37t1N4/u7mfzE73tLkWSJEklV+pg3F8dAuDR3f1trkSSJEllV/JgPAzAo7v72lyJJEmSyu6ICMaP7HLFWJIkSYen3MF4sL6Voo/MbHM1kiRJKrNJg3FEfCEiHo2Iu8c5HhHxqYjoiYifRMQLp77M5uorxn2Dw+zu88oUkiRJOnStrBh/CTh7guOvBE4pvlYDnzn8slpTD8bgB/AkSZJ0eCYNxpl5M/D4BF3OBa7Imh8CiyLixKkqcCL91SE6OwLwA3iSJEk6PFOxx3gZ8HDD801F27TrHxzm2HkVurs6/QCeJEmSDktlCsaIJm1NPwkXEaupbbdg+fLlh/3C/dVhKp0dLO3ucsVYkmaZ7u5u9uxr7QZMK1eunOZqJGlyUxGMNwEnNzw/CdjSrGNmXgZcBrBq1arDvoxEf3WIro7gyQvncdfmnWQmEc1yuiRppi1btoxtT+xsqe+aNWumuRpJmtxUbKW4BnhLcXWKlwA7M3PrFIw7qfqK8ZOPnUvv4BB7+r0yhSRJkg7NpCvGEXE1cAZwQkRsAv4S6ALIzEuBa4FzgB5gH/BH01XsWP2Dw1SKFWOo3ehj4byumXp5SZIkHUEmDcaZed4kxxN455RVdBD6q0NUOoPFC+YAsGPfQDvKkCRJ0hGg3He+qw5T6ehgYXct3+/sG2xzRZIkSSqr8gfjzqDS0cGCOZ3s7nWPsSRJkg5NuYPx4BBdHbW/wrHdXexyxViSJEmHqNzBuFgxBjh2Xhe7eg3GkiRJOjTlD8bFLaGP7e5iZ59bKSRJknRoSh6Mh6h01rdSVNjbX6U6PNzmqiRJklRGpQ3GQ8PJ4FCObKU4rrh+8W5XjSVJknQIShuMB6q1leH6h+/qN/Zwn7EkSZIORWmDcX91CGD/h++KaxnvcsVYkiRJh6DEwbi2YlwpVoyPc8VYkiRJh6G8wXiwCMbFinH3nE4qHWEwliRJ0iEpbzCub6UoLtcWEcUl2wzGkiRJOnglDsbFh+869/8Vjp1XYZe3hZYkSdIhKHEwHr1iDK3fFjozp60uSZIklVN5g/HIHuPGFePabaEnCr7rt+ziWR+8jocf3zftNUqSJKk8yhuMR65KMXrFuDqc7JzgA3j3P7qbgeowD243GEuSJGm/Egfj0dcxhtoeY4BHdvWPe149NO/p90N6kiRJ2q/EwXj0ne8AjuuuXcv4F7v6xj1vx75aIPbW0ZIkSWpU3mA85jrGUNtjDPDIzvGDcX3F2GAsSZKkRuUNxiNbKfb/FRYWWym2ThCM6yvGe/oNxpIkSdqvxMH4wA/fVTo7OK67i4cmuOLEzt4BAHZ7IxBJkiQ1KH8wbthKAXD8gjk8uH3vuOft//CdK8aSJEnar7zBeHCICOiM0cF48YI5bJzgUmz1rRS73GMsSZKkBuUNxtVh5lY6iCbB+LE9/eOuCO+orxgbjCVJktSgpWAcEWdHxE8joici/qLJ8bdGxLaIuKP4+pOpL3W0WjDuPKD9+GPmAjTdTpGZDVelcI+xJEmS9ps0GEdEJ/APwCuBU4HzIuLUJl2/kpnPL74+N8V1HqC/OsTcyoHlL14wB6Dpne36BocZKPYmu8dYkiRJjVpZMT4d6MnMn2fmAPDPwLnTW9bk+geHmds1fjDe2GTFeEdxRYoIr2MsSZKk0VoJxsuAhxuebyraxvq9iPhJRHwtIk5uNlBErI6ItRGxdtu2bYdQ7n7jbaWY29XJCcfM5cHHDlwxrm+jWLpwnnuMJUmSNEorwTiatOWY598CVmTmrwA3AJc3GygzL8vMVZm5asmSJQdX6RjjbaUAWLF4fvMV4+KKFCc9qZs9A1WGh8f+NSRJknS0aiUYbwIaV4BPArY0dsjM7ZnZXzz9J+BFU1Pe+OpXpWjmqYsXNL3JR2MwzoS9A64aS5IkqaaVYPxj4JSIeFpEzAHeCFzT2CEiTmx4+hpgw9SV2Fz/YPOtFFBbMd66s4++waFR7bt668F4PuAH8CRJkrTfpME4M6vAu4B/oxZ4v5qZ90TERyLiNUW3d0fEPRFxJ/Bu4K3TVXBdf3Wo6YfvAJYvrgXfsavG9Q/fnXx8N+AH8CRJkrRfpZVOmXktcO2Ytg81PH4/8P6pLW1iE22lWLF4AQAbH9vLM5cuHGnfsW+Qzo5g6bHzAIOxJEmS9iv5ne/G20pRC8aN1zK+6raHWPvgE8yrdPDDn20H4Ft3bml6viRJko4+5Q3Gg+NfleK4+V0smt91wJUpegeG6J5TYW5XLVCP3YMsSZKko1d5g3G1+Q0+6p5+wgLuf2TPqLbegSG6uzqYVwTj/sHhaa1RkiRJ5VHuYDzOVgqA05Ydxz1bdo66VnHv4BDz51RGVpr7qq4YS5IkqabEwXj8rRRQC8Z7B4ZGbafYN1Cle04ncyodBNDnirEkTa+hKp37ttN977V033stnfseb3dFkjSulq5KMdsMDSeDQznxivFTjgPgrs07efqSY4DainF3VycdEcypdNDvirEkTZuVK1eyefNmAJYtW1q0LmXlypXtK0qSJlDKYDxQra30TrTH+JSlxzCn0sE9W3Zx7vOXMZxJ3+Aw3XNqYXpeV6d7jCVpGq1Zs4Y1a9a0uwxJalkpg3F9pXe8rRRX3fYQAE9eOJcb1j/CisUL6BuonTO/CMZzKx3uMZYkSdKIUu4x7q+vGE+wlQLgKcd1s2VnL5nJvuLSbN1drhhLkiTpQOUMxoP1YDxx+csWddM3OMwT+wbpLVaM92+lcMVYkiRJ+5UzGNe3UkywxxjgKYu6Adi8o5feYsV4fld9K0WnV6WQJEnSiJIG49a2Uiw9di6dEWzZ0cu+YsV4XsOKcb93vpMkSVLhiPzwXV2ls4Olx87lni072bKjF4D5c2p/5XmVTrdSSJIkaUQ5V4xb3GMM8OwTj+XxvQM88Nhenrxw7siH7+Z2dTA4lAwOuZ1CkiRJpV0xrl/HeOKtFABnPmcpZz5n6QHt84pz9/ZXWTR/ztQWKEmSpNIp54pxi1spJlLfn7y7rzolNUmSJKncyr1iXOngA9+864DjcysBQHUoOfn4+SO3hIbaCvIHvnkXpy07FqgF409efx/vPeuZfPL6+0b61Z+/96xnjhq73udg2uttja8ztl+rDufc6TRb66qb7fVp5vhekCSNp5wrxoMTb6Xoryb91WQoYeP2fdx476MjX3V3b94FwO6+QS7+3v0AXPy9+0e+6s/HajzeavvYx836tepwzp1Os7Wuutlen2aO7wVJ0njKGYynYCtF3Z5+t1JIkiSptMG49atSTGZn7+BhjyFJkqTyK3kwnvyqFJPx16qSJEmCsgbjwSEioKszDnusR3b1AfB33/3pqPb3fuWOwx5bkiRJ5VG6YFwdGuaWnsdYvGAOEYcfjN/wopMB+PSNPaPav/mfmw+5von0Dni3PUmSpNmopWAcEWdHxE8joici/qLJ8bkR8ZXi+G0RsWKqC6371PfuZ91DO/jgq0+dkvGefWLtsm3v++1njWr/pWPnjXq+u2+QL/7HAyPP+wb3B9zP37K//eofPQRAZnLjvY+MtG/YWrsKxn/5+E1TUvdsMjSc7S5BkiTpsE0ajCOiE/gH4JXAqcB5ETE2lb4NeCIzVwKfBD421YUC3Pqz7VxyUw+ve9FJnPv8ZVM69vELRt/97m2//jQA7nh4B9+5ayu/9jc38uFvrR85/hsfu4nz/umH/NbffZ+/+vb+9r++9l527Bvgo9fdyx9/aS0Af/Xt9bzh0ltHjf/6S2/liw2BejJP7B3gextqQXvtxsfJnB1htG9wiHdceTtQ+6Flttmxb4BPFNtktuzobXM1aqfM5IHH9gLw379yB3dv3tnmiiRJs00rK8anAz2Z+fPMHAD+GTh3TJ9zgcuLx18DXhFTsc+hQWbyt9/9KU9bvIAPv+aXp3Lops578XIA3nXVOt5+5TpWLj2Gd5zxjJHj1eFh1m58nN6BIc541pKR9r7BId7w2Vv57A9+zu++sBbeP3/LA5y4qLYC/c6XrwRq4fZTN97PzfdtG7X6PNa+gSqX3fwzfvPjN/G2y2tB+3WX3sr5X/wxm57YN7V/6YPQNzjEfz70BOf90w/57vpaYP/E9ffxme//bNaE9tsffJxzLv53PlVsk/m1j97IGy+7lVvuf2zW1KiJVYeGR/1bDVSHR21H6hscYsuOXgaKD+T2DQ6xfssuHtq+j+Hh5Im9A3z1xw/zzqvW8dK/uZGX/+33Abj27q28/tJbuWH9I0iSVNfKne+WAQ83PN8EvHi8PplZjYidwGLgsakoEiAi+ML5v8pje/tZMHf6b9h3zR1bANj0RC8vOHkRr33+Mro69/8c8WfF1ot6/v/+T7cBcPrTjue2Bx7ntGXH8cLlT+Ib6zZzwStOYdH8Lj78rfUjY/zpbzydr63bxFu+8CMiYOnCeQwODbOnv8qcSgcL51bYOzA0cjm5lz9rCc9Ycgyfu+UBXvXcE7l+wyOc8fHvs2h+FwPVYbo6O5hX3PCkvzpEZu1ydp2dwWA1GSz2Pu//cSXoiNrzjgg6IkYeR0AmDGeSWfuhJNn/fDhrK7HV4WRupYPzfnU5V/3oIZ530nF87Lp7+T/X3Uv3nE66uzqZ19VJZ0eQ1MeqvfrI2Owfk3ofaq853PDa1OuBUeeNetwwbt3xC+bwjjOewT9+/2f89qlLuXPTDt70+dtYOK/CnM4OKp1BpaP2Z/284hVH1UtRU6PGn/3qD0f+JEYe7/8+7h+nPlIU40Txb1E/r/GcbPh7A8W/W23wjo7m5+z/t8rinBg5r/HfeTiT4eHiz+J7PjycdHQEczo7iKiF0aHhZE6l9n0arCYDQ8NUOoK5XR0MDSW9g0NEBPMqHXR0BL0DQwwMDdPd1cncrg56B4bpHai9t+fPqVAdHmZPX5XhhGPmVejqCHb1VekdHGLBnE4Wzutid98gu/qqzOns4EkLuhgcSh7fOwDAwrkVuud0sm1PP5m1793iBXN5fG8/9Z098+d00jc4xHDCsfMqrDhhAac/7XiuuXMLN//5y/mTy9fyp19ey/9+7XP5g+IHYUnS0S0mWzmLiNcDv5OZf1I8fzNwemauaehzT9FnU/H8Z0Wf7WPGWg2sLp4+Cxh9KYipdQJTGMxnUBnrLmPNUM66y1gzlLPuyWp+amYumeD4ESUitgEPTvGwZXxf1Fn7zCtr3WDt7TC27pbm7FaWXjcBJzc8PwnYMk6fTRFRAY4DHh87UGZeBlzWwmsetohYm5mrZuK1plIZ6y5jzVDOustYM5Sz7jLWPJ2m44eAMn+PrX3mlbVusPZ2ONS6W9lj/GPglIh4WkTMAd4IXDOmzzXA+cXj1wE3pps4JUmSVCKTrhgXe4bfBfwb0Al8ITPviYiPAGsz8xrg88CXI6KH2krxG6ezaEmSJGmqtfQptsy8Frh2TNuHGh73Aa+f2tIO24xs2ZgGZay7jDVDOesuY81QzrrLWHPZlPl7bO0zr6x1g7W3wyHVPemH7yRJkqSjQeluCS1JkiRNhyMyGE92C+vZJiJOjoibImJDRNwTERe0u6aDERGdEfGfEfHtdtfSiohYFBFfi4h7i+/5S9tdUysi4r3F++PuiLg6IuZNftbMi4gvRMSjEXF3Q9vxEXF9RNxf/PmkdtY41jg1f7x4j/wkIr4ZEYvaWeORpmzzNJR/robyzdd1ZZ23oTxzN5Rz/oapncOPuGDc4i2sZ5sq8L7MfA7wEuCdJai50QXAhnYXcRAuBq7LzGcDz6MEtUfEMuDdwKrMPI3aB2Fn64dcvwScPabtL4DvZeYpwPeK57PJlziw5uuB0zLzV4D7gPfPdFFHqpLO01D+uRrKN1/XlW7ehtLN3VDO+RumcA4/4oIxrd3CelbJzK2Zua54vJvaf/DL2ltVayLiJOBVwOfaXUsrIuJY4DepXUmFzBzIzB3traplFaC7uFb4fA68nviskJk3c+B1zBtvG3858NoZLWoSzWrOzO9mZrV4+kNq13DX1CjdPA3lnquhfPN1XcnnbSjJ3A3lnL9haufwIzEYN7uFdZkmrhXAC4Db2ltJy/4e+HNguN2FtOjpwDbgi8WvEz8XEQvaXdRkMnMz8LfAQ8BWYGdmfre9VR2UpZm5FWrhAnhym+s5WH8MfKfdRRxBSj1PQynnaijffF1Xynkbjoi5G8o/f8NBzOFHYjCOJm2luPRGRBwDfB14T2buanc9k4mIVwOPZubt7a7lIFSAFwKfycwXAHuZnb8WGqXY03Uu8DTgKcCCiHhTe6s6OkTE/6T2K/Qr213LEaS08zSUb66G0s7XdaWct8G5ezY42Dn8SAzGrdzCetaJiC5qE+2VmfmNdtfTopcBr4mIjdR+FfpbEfF/21vSpDYBmzKzvsrzNWoT7mx3JvBAZm7LzEHgG8Cvtbmmg/FIRJwIUPz5aJvraUlEnA+8GvhD7+Y5pUo5T0Np52oo53xdV9Z5G8o/d0NJ5284tDn8SAzGrdzCelaJiKC2d2pDZn6i3fW0KjPfn5knZeYKat/nGzNzVv8knJm/AB6OiGcVTa8A1rexpFY9BLwkIuYX75dXUJIPnxQabxt/PvCvbaylJRFxNvA/gNdk5r5213OEKd08DeWdq6Gc83VdiedtKP/cDSWcv+HQ5/AjLhgXG63rt7DeAHw1M+9pb1WTehnwZmo/wd9RfJ3T7qKOYGuAKyPiJ8Dzgb9ucz2TKlZKvgasA+6i9t/urLwbUURcDdwKPCsiNkXE24CPAmdFxP3AWcXzWWOcmj8NLASuL/6bvLStRR5BSjpPg3N1O5Vu3oZyzd1QzvkbpnYO9853kiRJEkfgirEkSZJ0KAzGkiRJEgZjSZIkCTAYS5IkSYDBWJIkSQIMxpIkSRJgMNYsExErIuLudtchSZqcc7aONAZjqUUR0dnuGiRJrXHO1qEwGGtGRcRgBaRaAAADAUlEQVQHI+LeiLg+Iq6OiD+LiBdFxJ0RcSvwzoa+b42If42I6yLipxHxlxOM+98a7kT1QETcFBGdEfGliLg7Iu6KiPcWfVdGxA3Fa66LiGdEzccb+v5+0feMYqyrqN21iIh4U0T8qHitzzr5SjpSOWfraFNpdwE6ekTEKuD3gBdQe++tA24HvgisycwfRMTHx5x2OnAasA/4cUT8v8xcO3bszLwUuDQiuoAbgU9Qu23ossw8rXj9RUX3K4GPZuY3I2IetR8Qf7fo/zzghOK1bm6sITMfiIjnAL8PvCwzByPiH4E/BK443O+PJM0mztk6GrlirJn068C/ZmZvZu4GvgUsABZl5g+KPl8ec871mbk9M3uBbxRjTORi4MbM/Bbwc+DpEXFJRJwN7IqIhdQm3m8CZGZfZu4rxr06M4cy8xHgB8CvFmP+KDMfKB6/AngRtUn4juL50w/lmyFJs5xzto46rhhrJkWTtr1ATnDO2GPj9o2ItwJPBd4FkJlPRMTzgN+h9uu+NwDvOYjaGmts7Hd5Zr5/gv6SdCRwztZRxxVjzaRbgP8aEfMi4hjgVUX7zoioryr84ZhzzoqI4yOiG3gt8B/NBo6IFwF/BrwpM4eLthOAjsz8OvBB4IWZuQvYFBGvLfrMjYj5wM3A7xd73JYAvwn8qMlLfQ94XUQ8uTj/+Ih46iF8LyRptnPO1lHHFWPNmMz8cURcA9wJPAisBXYCfwR8ISL2Af825rRbqP2qbiVwVbO9aoV3AccDN0UExdiXAF+MiPoPgPUVgzcDn42IjwCDwOuBbwIvLWpL4M8z8xcR8ewxf4f1EXEh8N1i3EFqKxsPHuz3Q5JmM+dsHY0ic6LfiEhTKyKOycw9DT/xr87MdeP0fSuwKjPfNZM1SpJqnLN1tHHFWDPtsog4FZhHbd9X0wlWkjQrOGfrqOKKsUolIhZT2zM21isyc/tM1yNJGp9ztsrGYCxJkiThVSkkSZIkwGAsSZIkAQZjSZIkCTAYS5IkSYDBWJIkSQLg/wNvAMWu3LMAjwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 864x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# use seaborn to create a distplot (with rugplot indicators) and a boxplot of gdp_zscores to visualize outliers\n",
    "fig, ax = plt.subplots(1,2,figsize=(12,4))\n",
    "_ = sns.distplot(df_country.gdp_zscore,rug=True, ax=ax[0])\n",
    "_ = sns.boxplot(df_country.gdp_zscore, ax=ax[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_code</th>\n",
       "      <th>gdp_zscore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>USA</td>\n",
       "      <td>11.552402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>CHN</td>\n",
       "      <td>5.807531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>JPN</td>\n",
       "      <td>4.035723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>DEU</td>\n",
       "      <td>2.365951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>FRA</td>\n",
       "      <td>1.717156</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>GBR</td>\n",
       "      <td>1.626685</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>BRA</td>\n",
       "      <td>1.479186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>IND</td>\n",
       "      <td>1.264916</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>95</th>\n",
       "      <td>ITA</td>\n",
       "      <td>1.201040</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>CAN</td>\n",
       "      <td>1.007785</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    country_code  gdp_zscore\n",
       "203          USA   11.552402\n",
       "36           CHN    5.807531\n",
       "98           JPN    4.035723\n",
       "50           DEU    2.365951\n",
       "64           FRA    1.717156\n",
       "68           GBR    1.626685\n",
       "26           BRA    1.479186\n",
       "89           IND    1.264916\n",
       "95           ITA    1.201040\n",
       "32           CAN    1.007785"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the top 10 country_code and gdp_zscore sorted by gdp_zscore\n",
    "df_country[['country_code','gdp_zscore']].sort_values(by='gdp_zscore', ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set a zscore cutoff to remove the top 4 outliers\n",
    "gdp_zscore_cutoff = 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a normalized 'population_density_zscore' column\n",
    "df_country['population_density_zscore'] = zscore(df_country.population_density)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country_code</th>\n",
       "      <th>population_density_zscore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>MAC</td>\n",
       "      <td>9.660487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>MCO</td>\n",
       "      <td>9.474560</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>SGP</td>\n",
       "      <td>3.703655</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>82</th>\n",
       "      <td>HKG</td>\n",
       "      <td>3.287870</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>GIB</td>\n",
       "      <td>1.512029</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>BHR</td>\n",
       "      <td>0.662775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>MDV</td>\n",
       "      <td>0.461115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>129</th>\n",
       "      <td>MLT</td>\n",
       "      <td>0.460529</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>BMU</td>\n",
       "      <td>0.443888</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>BGD</td>\n",
       "      <td>0.404138</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    country_code  population_density_zscore\n",
       "118          MAC                   9.660487\n",
       "121          MCO                   9.474560\n",
       "170          SGP                   3.703655\n",
       "82           HKG                   3.287870\n",
       "71           GIB                   1.512029\n",
       "19           BHR                   0.662775\n",
       "124          MDV                   0.461115\n",
       "129          MLT                   0.460529\n",
       "24           BMU                   0.443888\n",
       "17           BGD                   0.404138"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the top 10 country_code and population_density_zscore sorted by population_density_zscore\n",
    "df_country[['country_code','population_density_zscore']].sort_values(by='population_density_zscore',ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set a zscore cutoff to remove the top 5 outliers\n",
    "population_density_zscore_cutoff = 1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop outliers (considering both gdp_zscore and population_density_zscore)\n",
    "df_country = df_country[(df_country.gdp_zscore < gdp_zscore_cutoff) & (df_country.population_density_zscore < population_density_zscore_cutoff)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train a Regression Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the training set of X with features (population_density, access_to_electricity) \n",
    "# and labels y (gdp)\n",
    "X = df_country[['population_density','access_to_electricity']].values\n",
    "y = df_country['gdp'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import and initialize a LinearRegression model using default parameters\n",
    "from sklearn.linear_model import LinearRegression\n",
    "lr = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# train the regressor on X and y\n",
    "lr.fit(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-124778701696.41553\n",
      "93890100.76729578\n",
      "4352251616.858018\n"
     ]
    }
   ],
   "source": [
    "# print out the learned intercept and coefficients\n",
    "print(lr.intercept_)\n",
    "print(lr.coef_[0])\n",
    "print(lr.coef_[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can use this mask to easily index into our dataset\n",
    "country_mask = (df_country.country_code == 'CAN').values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.46879775e+12])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# how far off is our model's prediction for Canada's gdp (country_code CAN) from it's actual gdp?\n",
    "lr.predict(X[country_mask]) - y[country_mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a new training set X that, in addition to population_density and access_to_electricity,\n",
    "# also includes the region_* dummies\n",
    "X = df_country[['population_density','access_to_electricity','region_europe','region_latin_america_and_caribbean',\n",
    "           'region_middle_east_and_north_africa','region_north_america','region_south_asia',\n",
    "           'region_subsaharan_africa']].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# instantiate a new model and train, with fit_intercept=False\n",
    "lr = LinearRegression(fit_intercept=False).fit(X,y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-5.30005884e+11])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# did the prediction for CAN improve?\n",
    "lr.predict(X[country_mask]) - y[country_mask]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py36",
   "language": "python",
   "name": "py36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
